Overview

This code generates the results presented in the paper “Application of cluster analysis to identify different reader groups through their engagement with a digital reading supplement”, encompassing all tables, figures, as well as detailed results in the appendices and supplementary materials.

Table of contents

  1. Load packages needed
  2. Data pre-processing to create the engagement data from log file dataset
  3. Apply five clustering algorithms to the engagement data (including figure 2 - number of clusters; figure 3 - clustering results)
  4. Validation metrics to evaluate algorithms (including Table 1 - validation results)
  5. K-means clustering results (including Figure 4 - Initial literacy ability for each cluster; Figure 5&6 - radar charts; Table S.6 & S.7 for average values of engagement/performance for each cluster)
  6. Exploratory data analysis - Figure S.8 & S.9 usage of Clusters 3,4,and 7 across levels of two skills

Data Structure

Before the analysis, we perform the hierarchical structure of the dataset below.

The diagram below provides a brief idea of the structure of the data and levels of variation we might see.

1. We first load all the necessary packages here.

pkgs <- c('dplyr', 'tidyverse', # data cleaning and data pre-processing, be used to create the engagement data
'ggplot2', # powerful data visualization 
'mclust', # apply the Gaussian graphical models, and examine the BIC (Bayesian Information Criterion) results generated by models
'factoextra', # apply the clustering algorithms, including k-means, k-medoids, clustering large applications, hierarchical clustering, as well as visualization and silhouette coefficients of the clustering results
'fpc', # perform clustering validation metrics including Dunn index and within cluster sum of squares
'extrafont', # into a uniform font e.g. Times New Roman, 12pt, Bold
'readr', # read the dataset
'fmsb'#, # perform radar charts
)
ins <- lapply(pkgs, library, character.only = T) #character.only is for accepting the character of "pkgs"

2. We create the six engagement variables for each skill.

# 1) all hours spent 
all_hours_spent_on_skills <- Data_G2 %>% 
  filter(!is.na(round_completed_at)) %>% # if a student did not finish the game, the elapsed time on the unfinished rounds is excluded
  mutate(skill = case_when(skill_family == "Early Decoding" ~ "Early_Decoding", 
                           skill_family == "Phonological Awareness" ~ "Phonological_Awareness",
                           skill_family == "Vocabulary" ~ "Vocabulary",
                           skill_family == "Microcomprehension" ~ "Microcomprehension")) %>% # Microcomprehension: comprehension processes
  group_by(user_id, skill) %>%
  filter(skill == "Early_Decoding" | skill == "Phonological_Awareness" | skill == "Vocabulary" | skill == "Microcomprehension") %>%
  group_by(user_id, skill) %>%
  dplyr::summarise(all_hours_spent = sum(elapsed_time_m)/60, # convert mins to hours
            BOY_performance_level, # DIBELS performance level
            gender_complete, 
            is_esl_complete, 
            is_special_ed_complete, 
            race_complete) %>%
  filter(!duplicated(user_id, skill) | n() == 1) %>%
  pivot_wider(., 
              id_cols = c(user_id,
                          BOY_performance_level,
                          gender_complete, 
                          is_esl_complete, 
                          is_special_ed_complete, 
                          race_complete), 
              names_from = skill, 
              names_prefix = "all_hours_spent_on_", 
              values_from = all_hours_spent) 


# 2) variety of games played 
variety_of_games_played_within_skills <- Data_G2 %>% 
  filter(!is.na(round_completed_at)) %>% 
  filter(elapsed_time_m != 0) %>%
  mutate(skill = case_when(skill_family == "Early Decoding" ~ "Early_Decoding",
                           skill_family == "Phonological Awareness" ~ "Phonological_Awareness",
                           skill_family == "Vocabulary" ~ "Vocabulary",
                           skill_family == "Microcomprehension" ~ "Microcomprehension")) %>%
  group_by(user_id, skill) %>%
  filter(skill == "Early_Decoding" | skill == "Phonological_Awareness" | skill == "Vocabulary" | skill == "Microcomprehension") %>%
  dplyr::summarise(variety_of_games = length(unique(game_id))) %>%
  pivot_wider(., 
              id_cols = user_id, 
              names_from = skill, 
              names_prefix = "variety_of_games_played_within_", 
              values_from = variety_of_games)


# 3) all attempts 
all_attempts_within_skills <- Data_G2 %>% 
  filter(!is.na(round_completed_at)) %>% 
  filter(elapsed_time_m != 0) %>%
  mutate(skill = case_when(skill_family == "Early Decoding" ~ "Early_Decoding",
                           skill_family == "Phonological Awareness" ~ "Phonological_Awareness",
                           skill_family == "Vocabulary" ~ "Vocabulary",
                           skill_family == "Microcomprehension" ~ "Microcomprehension")) %>%
arrange(round_started_at) %>%
  group_by(user_id, skill) %>%
  filter(skill == "Early_Decoding" | skill == "Phonological_Awareness" | skill == "Vocabulary" | skill == "Microcomprehension") %>%
  distinct() %>%
  dplyr::summarise(all_attempts = length(game_id)) %>%
  pivot_wider(., 
              id_cols = user_id, 
              names_from = skill, 
              names_prefix = "all_attempts_within_", 
              values_from = all_attempts)


# 4) the number of levels played 
all_number_of_levels_played_within_skills <- Data_G2 %>% 
  filter(!is.na(round_completed_at)) %>% 
  filter(elapsed_time_m != 0) %>%
  mutate(skill = case_when(skill_family == "Early Decoding" ~ "Early_Decoding",
                           skill_family == "Phonological Awareness" ~ "Phonological_Awareness",
                           skill_family == "Vocabulary" ~ "Vocabulary",
                           skill_family == "Microcomprehension" ~ "Microcomprehension",)) %>%
  filter(skill == "Early_Decoding" | skill == "Phonological_Awareness" | skill == "Vocabulary" | skill == "Microcomprehension") %>%
  group_by(user_id, skill, game_id) %>%
  distinct() %>%
  dplyr::summarise(game_id, game_level, levels = length(unique(game_level))) %>%
  dplyr::select(user_id, skill, levels) %>%
  distinct() %>%
  group_by(user_id, skill) %>%
  dplyr::summarise(all_number_of_levels_played = sum(levels)) %>%
  pivot_wider(., 
              id_cols = user_id, 
              names_from = skill, 
              names_prefix = "all_number_of_levels_played_within_", 
              values_from = all_number_of_levels_played)


# 5) days of playing
days_of_playing_within_skills <- Data_G2 %>% 
  filter(elapsed_time_m != 0) %>%
  filter(!is.na(round_completed_at)) %>% 
  mutate(skill = case_when(skill_family == "Early Decoding" ~ "Early_Decoding",
                           skill_family == "Phonological Awareness" ~ "Phonological_Awareness",
                           skill_family == "Vocabulary" ~ "Vocabulary",
                           skill_family == "Microcomprehension" ~ "Microcomprehension",)) %>%
  group_by(user_id, skill) %>%
  filter(skill == "Early_Decoding" | skill == "Phonological_Awareness" | skill == "Vocabulary" | skill == "Microcomprehension") %>%
  distinct() %>%
  dplyr::summarise(day = as_date(round_started_at), days_of_playing_within_skills = length(unique(day))) %>%
  filter(row_number() == 1) %>%
  pivot_wider(., 
              id_cols = user_id, 
              names_from = skill, 
              names_prefix = "days_of_playing_within_", 
              values_from = days_of_playing_within_skills)

# 6) proportion of early exit attempts
proportion_of_early_exit_rounds_within_skills <- Data_G2 %>% 
  filter(elapsed_time_m != 0) %>%
  filter(!is.na(round_completed_at)) %>%
  mutate(skill = case_when(skill_family == "Early Decoding" ~ "Early_Decoding",
                           skill_family == "Phonological Awareness" ~ "Phonological_Awareness",
                           skill_family == "Vocabulary" ~ "Vocabulary",
                           skill_family == "Microcomprehension" ~ "Microcomprehension",)) %>%
  group_by(user_id, skill) %>%
  filter(skill == "Early_Decoding" | skill == "Phonological_Awareness" | skill == "Vocabulary" | skill == "Microcomprehension") %>%
  distinct() %>%
  dplyr::summarise(game_id, game_level, all_number_of_rounds = length(game_id), all_number_of_early_exit_rounds = length(which(is_early_exit == "TRUE")), is_early_exit, proportion_of_early_exit_rounds_within_skills = all_number_of_early_exit_rounds/all_number_of_rounds) %>%
  filter(row_number() == 1) %>%
  pivot_wider(., 
              id_cols = user_id, 
              names_from = skill, 
              names_prefix = "proportion_of_early_exit_rounds_within_", 
              values_from = proportion_of_early_exit_rounds_within_skills)

# integrate the six variables together for each user
user_engagement <- inner_join(all_hours_spent_on_skills, variety_of_games_played_within_skills, by = "user_id")
user_engagement <- inner_join(user_engagement, all_attempts_within_skills, by = "user_id")
user_engagement <- inner_join(user_engagement, all_number_of_levels_played_within_skills, by = "user_id")
user_engagement <- inner_join(user_engagement, days_of_playing_within_skills, by = "user_id")
user_engagement <- inner_join(user_engagement, proportion_of_early_exit_rounds_within_skills, by = "user_id")

# replace NA by 0 for the engagement variables, if the user have no engagement on the skill
user_engagement1 <- user_engagement %>% mutate_at(c(7:ncol(user_engagement)), funs(replace_na(., 0))) 
length(unique(user_engagement1$user_id)) # there are 19,305 users in total, who will be separated into nine groups
## [1] 19305
write_csv(user_engagement1, "reader_engagement.csv") # this is the engagement data

3. Apply 5 clustering algorithms to the engagement data.

# load the engagement data
user_engagement1 <- read_csv("reader_engagement.csv")

Apply the Gaussian mixture models (GMM) to determine the optimal number of clusters first.

# library("mclust") # we use package "mclust" to apply model-based clustering method: GMM
user_engagement2 <- user_engagement1[, 7:30] # we only need engagement data, thus we exclude any information indicating the users themselves (the social demographics; initial literacy ability)
set.seed(123)
df <- scale(user_engagement2) # standardize across the engagement data, involving the conversion each original value into a z-score.
gmm <- Mclust(df) # apply the Gaussian graphical model (GMM) # it takes a few secs 
user_engagement1$cluster_gmm <- gmm$classification # integrate the clustering label from GMM for each user, named "cluster_gmm"

# performing the "figure 2."
m_clust_11 <- Mclust(df, G=1:11) # The BIC for GMM up to 11 clusters
options(scipen=999) # not scientific notation
par(family = "Times New Roman", font=12) # make the font size 12 and times new roman for any texts shown in the figure
# plot(m_clust_11, "BIC", G = 1:11),legendArgs = list(x = "upleft", ncol = 3), xlab = "The number of clusters") # ignore the error for the legend, since we don't need the information of the legend for each model.
plot(m_clust_11, "BIC", G = 1:11, xlab = "The number of clusters")

As shown in the figure, the BIC curve remained relatively flat beyond nine clusters indicating 9 clusters with different levels of engagement as the best solution.

Apply the rest 4 clustering algorithms: k-means, k-medoids (PAM), CLARA and hierarchical clustering

# library("factoextra") # we use "factoextra" to perform clustering algorithms, including partition clustering methods and hierarchial clustering
set.seed(123)
km_eclust <- eclust(df, "kmeans", 
                    k = 9,  # setting the number of clusters is 9
                    nstart = 50, graph = FALSE)  # apply k-means
user_engagement1$cluster_km_eclust <- km_eclust$cluster # integrate the clustering label from k-means for each user, named "cluster_km_eclust"

set.seed(123)
pam_eclust <- eclust(df, "pam", k = 9, graph = FALSE) # apply k-medoids (partition around medoids (PAM)) # it takes a few secs 
user_engagement1$cluster_pam_eclust <- pam_eclust$cluster # integrate the clustering label from PAM for each user, named "cluster_pam_eclust"

set.seed(123)
clara_eclust <- eclust(df, FUNcluster = "clara", 
                       k = 9, # setting the number of clusters is 9
                       k.max = 10, stand = FALSE, graph = FALSE)  # apply clustering large applications (CLARA)
user_engagement1$cluster_clara_eclust <- clara_eclust$clustering # integrate the clustering label from PAM for each user, named "cluster_clara_eclust"

set.seed(123)
hc_eclust <- eclust(df, "hclust", k = 9, hc_metric = "euclidean",
                 hc_method = "average", graph = FALSE) # apply hierarchical clustering # it takes a few secs 
user_engagement1$cluster_hc_eclust <- hc_eclust$clustering # integrate the clustering label from hierarchical clustering for each user, named "cluster_hc_eclust"

visulization of the clustering results obtained from 5 clustering algorithms

# visualize clustering results - each point represents a student
# library(factoextra) # package "factoextra" are used to visualise the clustering results
fviz_mclust(gmm, ggtheme = theme_minimal(), geom = "point", 
            pointsize = 1.2, ellipse = TRUE, ellipse.type = "convex") +
  theme(text=element_text(family="Times New Roman", size=12))

fviz_mclust(km_eclust, ggtheme = theme_minimal(), geom = "point", 
            pointsize = 1.2, ellipse = TRUE, ellipse.type = "convex") +
  theme(text=element_text(family="Times New Roman", size=12))

fviz_mclust(pam_eclust, ggtheme = theme_minimal(), geom = "point", 
            pointsize = 1.2, ellipse = TRUE, ellipse.type = "convex") +
  theme(text=element_text(family="Times New Roman", size=12))

fviz_mclust(clara_eclust,ggtheme = theme_minimal(), geom = "point", 
            pointsize = 1.2, ellipse = TRUE, ellipse.type = "convex") +
  theme(text=element_text(family="Times New Roman", size=12))

fviz_mclust(hc_eclust, ggtheme = theme_minimal(), geom = "point", 
            pointsize = 1.2, ellipse = TRUE, ellipse.type = "convex") +
  theme(text=element_text(family="Times New Roman", size=12))

4. Cluster validation.

Table 1. Validation results using 3 internal validation metrics for 5 clustering algorithms

## Silhouette coefficient: as higher as better
# Silhouette information can be extracted 
# dunn: as higher as better
# within.cluster.ss: as smaller as better - a cluster that has a small sum of squares is more compact than a cluster that has a large sum of squares.

# library(fpc) # package fpc are used to perform the clustering validation
d <- dist(df)

# Gaussian mixture models (GMM)
cs_gmm = cluster.stats(d, gmm$classification) # it takes a few secs
saveRDS(cs_gmm, file="cs_gmm.RData")
cs_gmm <- readRDS("cs_gmm.RData")
cs_gmm[c("within.cluster.ss","avg.silwidth", "dunn")] # silhouette: -0.0265033 # wcss: 262873.5 # dunn: 0.0006293064
## $within.cluster.ss
## [1] 262873.5
## 
## $avg.silwidth
## [1] -0.0265033
## 
## $dunn
## [1] 0.0006293064
# k-means
cs_km = cluster.stats(d, km_eclust$cluster) # it takes a few secs
saveRDS(cs_km, file="cs_km.RData")
cs_km <- readRDS("cs_km.RData")
cs_km[c("within.cluster.ss","avg.silwidth", "dunn")] # silhouette: 0.165256 # wcss: 181402.3 # dunn: 0.001127537
## $within.cluster.ss
## [1] 181402.3
## 
## $avg.silwidth
## [1] 0.165256
## 
## $dunn
## [1] 0.001127537
# OR a easier/quicker to obtain the silhouette coefficient is "km_eclust$silinfo$avg.width".

# pam
cs_pam = cluster.stats(d, pam_eclust$cluster)
saveRDS(cs_pam, file="cs_pam.RData")
cs_pam <- readRDS("cs_pam.RData")
cs_pam[c("within.cluster.ss","avg.silwidth", "dunn")] # silhouette: 0.1300961 # wcss: 194398.9 # dunn: 0.001865158
## $within.cluster.ss
## [1] 194398.9
## 
## $avg.silwidth
## [1] 0.1300961
## 
## $dunn
## [1] 0.001865158
# OR a easier/quicker to obtain the silhouette coefficient is "pam_eclust$silinfo$avg.width".

# clara
cs_clara = cluster.stats(d, clara_eclust$cluster)
saveRDS(cs_clara, file="cs_clara.RData")
cs_clara <- readRDS("cs_clara.RData")
cs_clara[c("within.cluster.ss","avg.silwidth", "dunn")] # silhouette: 0.1132297 # wcss: 202077.8 # dunn: 0.001141634
## $within.cluster.ss
## [1] 214890.5
## 
## $avg.silwidth
## [1] 0.08503246
## 
## $dunn
## [1] 0.001605344
# OR a easier/quicker to obtain the silhouette coefficient is "clara.res$silinfo$avg.width". # 0.1710795


# hc
cs_hc = cluster.stats(d, hc_eclust$cluster)
saveRDS(cs_hc, file="cs_hc.RData")
cs_hc <- readRDS("cs_hc.RData")
cs_hc[c("within.cluster.ss","avg.silwidth", "dunn")] # silhouette: 0.6747994 # wcss:443703.3 the smaller the more compact # dunn: 0.2131273
## $within.cluster.ss
## [1] 443703.3
## 
## $avg.silwidth
## [1] 0.6747994
## 
## $dunn
## [1] 0.2131273
# OR a easier/quicker to obtain the silhouette coefficient is "hc_eclust$silinfo$avg.width". # 0.6747994

Table S.3 (see supplementary materials) - The average silhouette width for each cohort for each method

## plots and values for silhouette width for each group
fviz_silhouette(km_eclust, palette = "jco", ggtheme = theme_classic())
##   cluster size ave.sil.width
## 1       1 1726          0.19
## 2       2 4862          0.33
## 3       3  248          0.03
## 4       4  571          0.06
## 5       5 2516          0.10
## 6       6 3468          0.12
## 7       7  787          0.05
## 8       8 1126          0.11
## 9       9 4001          0.10

fviz_silhouette(pam_eclust, palette = "jco", ggtheme = theme_classic())
##   cluster size ave.sil.width
## 1       1 2303          0.06
## 2       2 1489          0.13
## 3       3 3182          0.16
## 4       4 2845          0.06
## 5       5 2249          0.09
## 6       6 2368          0.03
## 7       7 3076          0.38
## 8       8  945          0.12
## 9       9  848         -0.06

fviz_silhouette(clara_eclust, palette = "jco", ggtheme = theme_classic())
##   cluster size ave.sil.width
## 1       1 3900          0.03
## 2       2 4972          0.05
## 3       3 3319          0.03
## 4       4  431          0.17
## 5       5 1289          0.16
## 6       6  723          0.12
## 7       7 3423          0.28
## 8       8   42          0.04
## 9       9 1206         -0.10

fviz_silhouette(hc_eclust, palette = "jco", ggtheme = theme_classic())
##   cluster  size ave.sil.width
## 1       1 19277          0.68
## 2       2    13          0.14
## 3       3     1          0.00
## 4       4     1          0.00
## 5       5     5          0.09
## 6       6     4          0.24
## 7       7     2          0.58
## 8       8     1          0.00
## 9       9     1          0.00

#library(cluster) # using cluster package to conduct the silhouette coefficient width for each cohort for Gaussian mixture models (GMM)
#sil<-silhouette(gmm$classification,na.omit(df))
#fviz_silhouette(sil,label=FALSE,print.summary=FALSE)

K-means performed the best out of the five we tested, thus the results from k-means was used for the remaider of our analysis.

5. Clustering results (k-means).

5.1 Figure 4 - Initial literacy ability (DIBELS) of each cluster.

# library(extrafont) # into a uniform font e.g. Times New Roman, 12pt, Bold

# change clustering labels
user_engagement1 <- user_engagement1 %>% mutate(cluster_km_eclust_reorder = case_when(cluster_km_eclust == 1 ~ 1,
                                                                                         cluster_km_eclust == 2 ~ 4,
                                 cluster_km_eclust == 3 ~ 5,
                                 cluster_km_eclust == 4 ~ 2,
                                 cluster_km_eclust == 5 ~ 8,
                                 cluster_km_eclust == 6 ~ 3,
                                 cluster_km_eclust == 7 ~ 9,
                                 cluster_km_eclust == 8 ~ 6,
                                 cluster_km_eclust == 9 ~ 7))
user_engagement1 %>% group_by(cluster_km_eclust_reorder) %>% dplyr::summarise(n = length(unique(user_id))) 
## # A tibble: 9 × 2
##   cluster_km_eclust_reorder     n
##                       <dbl> <int>
## 1                         1  1726
## 2                         2   571
## 3                         3  3468
## 4                         4  4862
## 5                         5   248
## 6                         6  1126
## 7                         7  4001
## 8                         8  2516
## 9                         9   787
dibels_graph <- user_engagement1 %>% 
  group_by(cluster_km_eclust_reorder) %>%
  dplyr::reframe(totol = length(user_id), BOY_performance_level) %>%
  ungroup(.) %>%
  group_by(cluster_km_eclust_reorder, BOY_performance_level) %>%
  dplyr::mutate(n = length(BOY_performance_level), p = round(n/totol * 100, digits = 2)) %>%
  distinct(.) %>%
  ggplot(., aes(x = as.factor(cluster_km_eclust_reorder), y = p, group = BOY_performance_level)) + geom_bar(aes(fill = BOY_performance_level), stat = "identity") +
  geom_text(aes(label = p), position = position_stack(vjust = 0.5), size = 3, family = "Times New Roman") + 
  xlab("Clusters") + ylab("Proportion of users (%)") + 
  theme_bw() + theme(panel.grid=element_blank()) +  guides(fill = guide_legend(title = "Beginning-of-Year performance")) + theme(text=element_text(family="Times New Roman", face = "bold", size=12)) + geom_hline(yintercept = c(50,75), linetype = "dashed")
dibels_graph

5.2 Average value of 6 engagement variables (Table S.6 - clustering results of the engagement values for each cluster across four skills).

library(knitr) # to generate table in Rmarkdown
# all hours spent
user_engagement1 %>% group_by(cluster_km_eclust_reorder) %>%
  dplyr::summarise(mean_allhours_ED = mean(all_hours_spent_on_Early_Decoding),
                   mean_allhours_PA = mean(all_hours_spent_on_Phonological_Awareness),
                   mean_allhours_V = mean(all_hours_spent_on_Vocabulary),
                   mean_allhours_MC = mean(all_hours_spent_on_Microcomprehension),
                   n = length(unique(user_id))) %>%
  kable(.)
cluster_km_eclust_reorder mean_allhours_ED mean_allhours_PA mean_allhours_V mean_allhours_MC n
1 0.5667382 0.1270921 2.1218601 2.7249362 1726
2 1.1535604 0.4423894 4.6964930 5.4734016 571
3 0.6381883 0.0834815 0.5547715 0.9285886 3468
4 0.2496889 0.0726333 0.1051716 0.0950214 4862
5 6.7180282 1.5383972 4.6222594 5.0404368 248
6 4.0984516 0.4572004 1.5493638 1.6486245 1126
7 0.7123337 0.4235194 0.0493263 0.1681491 4001
8 2.1011235 1.0826231 0.3147816 0.8559454 2516
9 4.9885015 2.2298848 0.8451510 1.5333473 787
# variety of games
user_engagement1 %>% group_by(cluster_km_eclust_reorder) %>%
  dplyr::summarise(mean_varietygame_ED = mean(variety_of_games_played_within_Early_Decoding),
                   mean_varietygame_PA = mean(variety_of_games_played_within_Phonological_Awareness),
                   mean_varietygame_V = mean(variety_of_games_played_within_Vocabulary),
                   mean_varietygame_MC = mean(variety_of_games_played_within_Microcomprehension),
                   n = length(unique(user_id))) %>%
  kable(.)
cluster_km_eclust_reorder mean_varietygame_ED mean_varietygame_PA mean_varietygame_V mean_varietygame_MC n
1 1.808806 0.4385863 4.3464658 6.2230591 1726
2 3.618214 1.9387040 5.3957968 8.4483363 571
3 1.990773 0.3143022 2.5158593 3.3641869 3468
4 1.255656 0.4424105 0.7239819 0.5952283 4862
5 7.580645 2.7419355 4.5887097 7.0927419 248
6 7.047957 0.7326821 2.7957371 3.6465364 1126
7 2.244189 1.4306423 0.3196701 1.0427393 4001
8 4.835851 1.5186804 1.3247218 2.4451510 2516
9 6.158831 1.8729352 1.9593393 2.9618806 787
# all attempts 
user_engagement1 %>% group_by(cluster_km_eclust_reorder) %>%
  dplyr::summarise(mean_allattempts_ED = mean(all_attempts_within_Early_Decoding),
                   mean_allattempts_PA = mean(all_attempts_within_Phonological_Awareness),
                   mean_allattempts_V = mean(all_attempts_within_Vocabulary),
                   mean_allattempts_MC = mean(all_attempts_within_Microcomprehension),
                   n = length(unique(user_id))) %>%
  kable(.)
cluster_km_eclust_reorder mean_allattempts_ED mean_allattempts_PA mean_allattempts_V mean_allattempts_MC n
1 13.252028 3.122827 56.656431 63.232329 1726
2 30.292469 14.651489 133.478109 134.891419 571
3 14.825548 1.861592 17.225779 20.988754 3468
4 5.190251 1.396133 3.340189 1.822912 4862
5 181.584677 49.625000 155.967742 134.649193 248
6 101.572824 10.438721 48.975133 38.911190 1126
7 13.010497 9.412147 1.774806 3.580105 4001
8 41.335056 23.679253 10.847377 19.901431 2516
9 101.750953 50.381194 28.745870 37.839898 787
# number of levels played
user_engagement1 %>% group_by(cluster_km_eclust_reorder) %>%
  dplyr::summarise(mean_nlp_ED = mean(all_number_of_levels_played_within_Early_Decoding),
                   mean_nlp_PA = mean(all_number_of_levels_played_within_Phonological_Awareness),
                   mean_nlp_V = mean(all_number_of_levels_played_within_Vocabulary),
                   mean_nlp_MC = mean(all_number_of_levels_played_within_Microcomprehension),
                   n = length(unique(user_id))) %>%
  kable(.)
cluster_km_eclust_reorder mean_nlp_ED mean_nlp_PA mean_nlp_V mean_nlp_MC n
1 7.125145 1.2560834 23.2902665 22.831982 1726
2 14.551664 5.7197898 41.5236427 37.029772 571
3 7.432526 0.7880623 8.3457324 8.245963 3468
4 2.830111 0.7221308 1.6585767 1.042369 4862
5 53.689516 12.6411290 38.7459677 25.334677 248
6 36.570160 3.2539964 17.2397869 11.484014 1126
7 4.560360 3.4651337 0.8522869 1.785054 4001
8 14.496820 6.4554849 4.3485692 6.974563 2516
9 24.261754 9.5857687 8.1092757 9.139771 787
# days of playing
user_engagement1 %>% group_by(cluster_km_eclust_reorder) %>%
  dplyr::summarise(mean_days_ED = mean(days_of_playing_within_Early_Decoding),
                   mean_days_PA = mean(days_of_playing_within_Phonological_Awareness),
                   mean_days_V = mean(days_of_playing_within_Vocabulary),
                   mean_days_MC = mean(days_of_playing_within_Microcomprehension),
                   n = length(unique(user_id))) %>%
  kable(.)
cluster_km_eclust_reorder mean_days_ED mean_days_PA mean_days_V mean_days_MC n
1 5.385863 1.1575898 21.0787949 19.8852839 1726
2 10.530648 5.2626970 36.3957968 33.2644483 571
3 5.791234 0.7964245 7.5126874 7.6355248 3468
4 2.350062 0.7354998 1.5989305 0.8918141 4862
5 43.068548 12.6370968 39.2741935 28.1935484 248
6 29.952931 3.3818828 18.3268206 10.7015986 1126
7 5.295926 3.8427893 0.7495626 1.6670832 4001
8 15.283386 9.0278219 4.4852941 6.6736884 2516
9 31.963151 16.3227446 10.6048285 10.4891995 787
# proportion of early exit rounds
user_engagement1 %>% group_by(cluster_km_eclust_reorder) %>%
  dplyr::summarise(mean_peer_ED = mean(proportion_of_early_exit_rounds_within_Early_Decoding),
                   mean_peer_PA = mean(proportion_of_early_exit_rounds_within_Phonological_Awareness),
                   mean_peer_V = mean(proportion_of_early_exit_rounds_within_Vocabulary),
                   mean_peer_MC = mean(proportion_of_early_exit_rounds_within_Microcomprehension),
                   n = length(unique(user_id))) %>%
  kable(.)
cluster_km_eclust_reorder mean_peer_ED mean_peer_PA mean_peer_V mean_peer_MC n
1 0.1464497 0.0829156 0.3711045 0.4124860 1726
2 0.2339348 0.2390445 0.3948042 0.4166435 571
3 0.1669274 0.0507244 0.2976649 0.3845385 3468
4 0.0917885 0.0255896 0.1543492 0.0433165 4862
5 0.3699926 0.3403105 0.3912657 0.4643997 248
6 0.3110735 0.1933688 0.3165961 0.3704927 1126
7 0.3570741 0.2912242 0.0585027 0.1088271 4001
8 0.2994077 0.2971046 0.2364290 0.2797030 2516
9 0.3473799 0.3388661 0.3121443 0.3006765 787

5.3 Average value of 4 performance variables (Table S.7 - clustering results of the performance values for each cluster across four skills).

1). Number of levels mastered

# read the log file for Grade 2 students
Data_G2 <- read_csv("Data_G2.csv")
# Number of levels mastered
all_number_of_levels_mastered_within_skills <- Data_G2 %>% 
  filter(!is.na(round_completed_at)) %>% 
  filter(elapsed_time_m != 0) %>%
  filter(is_early_exit == "FALSE") %>%
  filter(is_level_mastered == TRUE) %>% 
  mutate(skill = case_when(skill_family == "Early Decoding" ~ "Early_Decoding",
                           skill_family == "Phonological Awareness" ~ "Phonological_Awareness",
                           skill_family == "Vocabulary" ~ "Vocabulary",
                           skill_family == "Microcomprehension" ~ "Microcomprehension")) %>%
  group_by(user_id, skill, game_id) %>%
  filter(skill == "Early_Decoding" | skill == "Phonological_Awareness" | skill == "Vocabulary" | skill == "Microcomprehension") %>%
  distinct() %>%
  dplyr::summarise(all_number_of_levels_mastered_within_games = length(unique(game_level))) %>%
  group_by(user_id, skill) %>%
  dplyr::summarise(all_number_of_levels_mastered = sum(all_number_of_levels_mastered_within_games)) %>%
  pivot_wider(., 
              id_cols = user_id, 
              names_from = skill, 
              names_prefix = "all_number_of_levels_mastered_within_", 
              values_from = all_number_of_levels_mastered)
#18824 users has mastered at least one level within 4 skill families
# 18721 users
summary(all_number_of_levels_mastered_within_skills)
##    user_id          all_number_of_levels_mastered_within_Early_Decoding
##  Length:18721       Min.   :  1.000                                    
##  Class :character   1st Qu.:  3.000                                    
##  Mode  :character   Median :  5.000                                    
##                     Mean   :  9.192                                    
##                     3rd Qu.: 11.000                                    
##                     Max.   :159.000                                    
##                     NA's   :1950                                       
##  all_number_of_levels_mastered_within_Microcomprehension
##  Min.   : 1.000                                         
##  1st Qu.: 2.000                                         
##  Median : 5.000                                         
##  Mean   : 7.799                                         
##  3rd Qu.:10.000                                         
##  Max.   :70.000                                         
##  NA's   :3965                                           
##  all_number_of_levels_mastered_within_Phonological_Awareness
##  Min.   : 1.000                                             
##  1st Qu.: 2.000                                             
##  Median : 3.000                                             
##  Mean   : 4.008                                             
##  3rd Qu.: 5.000                                             
##  Max.   :69.000                                             
##  NA's   :8561                                               
##  all_number_of_levels_mastered_within_Vocabulary
##  Min.   : 1.00                                  
##  1st Qu.: 3.00                                  
##  Median : 6.00                                  
##  Mean   : 9.83                                  
##  3rd Qu.:13.00                                  
##  Max.   :85.00                                  
##  NA's   :6213
user_id <- unique(user_engagement1$user_id)
user_id <- as.data.frame(user_id)
all_number_of_levels_mastered_within_skills <- left_join(user_id, all_number_of_levels_mastered_within_skills, by = "user_id")

all_number_of_levels_mastered_within_skills$all_number_of_levels_mastered_within_Phonological_Awareness[is.na(all_number_of_levels_mastered_within_skills$all_number_of_levels_mastered_within_Phonological_Awareness)] <- 0
all_number_of_levels_mastered_within_skills$all_number_of_levels_mastered_within_Early_Decoding[is.na(all_number_of_levels_mastered_within_skills$all_number_of_levels_mastered_within_Early_Decoding)] <- 0
all_number_of_levels_mastered_within_skills$all_number_of_levels_mastered_within_Vocabulary[is.na(all_number_of_levels_mastered_within_skills$all_number_of_levels_mastered_within_Vocabulary)] <- 0
all_number_of_levels_mastered_within_skills$all_number_of_levels_mastered_within_Microcomprehension[is.na(all_number_of_levels_mastered_within_skills$all_number_of_levels_mastered_within_Microcomprehension)] <- 0

left_join(user_engagement1, all_number_of_levels_mastered_within_skills, by = "user_id") %>% 
  group_by(cluster_km_eclust_reorder) %>%
  dplyr::summarise(mean_nlm_PA = mean(all_number_of_levels_mastered_within_Phonological_Awareness),
                   mean_nlm_ED = mean(all_number_of_levels_mastered_within_Early_Decoding),
                   mean_nlm_V = mean(all_number_of_levels_mastered_within_Vocabulary),
                   mean_nlm_MC = mean(all_number_of_levels_mastered_within_Microcomprehension),
                   n = length(user_id)) %>%
  kable(.)
cluster_km_eclust_reorder mean_nlm_PA mean_nlm_ED mean_nlm_V mean_nlm_MC n
1 1.0133256 6.351101 19.2444959 18.4536501 1726
2 4.6654991 12.814361 35.3222417 30.6182137 571
3 0.6329296 6.482699 6.7560554 6.2730681 3468
4 0.4310983 2.218223 1.3087207 0.8179761 4862
5 10.4959677 45.943548 31.8911290 19.8669355 248
6 2.5994671 31.148313 14.0950266 9.1634103 1126
7 2.1492127 3.015746 0.6730817 1.3636591 4001
8 4.7456280 11.478140 3.3608903 5.4248808 2516
9 7.5641677 19.318933 6.1524778 7.1969504 787

2). Time to mastery

# user_id_master is the user_id for user_engagement1
user_id_master <- unique(all_number_of_levels_mastered_within_skills$user_id)

help <- Data_G2 %>% # 
  filter(user_id %in% user_id_master) %>%
  filter(skill_family == "Early Decoding"| skill_family == "Phonological Awareness" | skill_family == "Vocabulary" | skill_family == "Microcomprehension") %>%
  filter(is_early_exit == "FALSE") %>%
  arrange(round_started_at) %>%
  group_by(user_id, skill_family, game_id, game_level) %>%
  dplyr::summarise(is_level_mastered, elapsed_time_m, round_started_at)

help1 <- help  %>%
  group_by(user_id, skill_family, game_id, game_level, is_level_mastered) %>% 
  dplyr::summarise(n = length(is_level_mastered)) %>%
  pivot_wider(., 
              id_cols = c(user_id, skill_family, game_id, game_level),
              names_from = is_level_mastered,
              values_from = n,
              names_prefix = "n_")
help1
## # A tibble: 497,473 × 7
## # Groups:   user_id, skill_family, game_id, game_level [497,473]
##    user_id                  skill_family game_id game_level n_FALSE n_TRUE  n_NA
##    <chr>                    <chr>        <chr>        <dbl>   <int>  <int> <int>
##  1 000429ab-2a11-4d5e-8a0e… Early Decod… tongue…          1       2      1    NA
##  2 000429ab-2a11-4d5e-8a0e… Early Decod… word_s…          4       1     NA    NA
##  3 000429ab-2a11-4d5e-8a0e… Microcompre… storyb…          4      NA      1    NA
##  4 000429ab-2a11-4d5e-8a0e… Phonologica… all_ab…         13      NA      2    NA
##  5 000429ab-2a11-4d5e-8a0e… Phonologica… all_ab…         14       3      1    NA
##  6 000429ab-2a11-4d5e-8a0e… Phonologica… cut_it…          4      NA      1    NA
##  7 000429ab-2a11-4d5e-8a0e… Phonologica… cut_it…          5      NA      1    NA
##  8 000429ab-2a11-4d5e-8a0e… Phonologica… cut_it…          6       2      1    NA
##  9 000429ab-2a11-4d5e-8a0e… Phonologica… gem_ny…         15       2      1    NA
## 10 000429ab-2a11-4d5e-8a0e… Phonologica… gem_ny…         16       2      1    NA
## # ℹ 497,463 more rows
ttm_separate <- help %>% 
  pivot_wider(., 
              id_cols = c(user_id, skill_family, game_id, game_level),
              names_from = is_level_mastered,
              values_from = elapsed_time_m,
              names_prefix = "is_level_mastered_",
              values_fn = sum)
ttm_not_correct <- left_join(ttm_separate, help1) %>% dplyr::select(user_id, skill_family, game_id, game_level, is_level_mastered_FALSE, is_level_mastered_TRUE, n_FALSE, n_TRUE)
ttm_correct <- ttm_not_correct %>% mutate(is_level_mastered_TRUE_correct_average = is_level_mastered_TRUE / n_TRUE) %>% dplyr::select(user_id, skill_family, game_id, game_level, is_level_mastered_FALSE, is_level_mastered_TRUE_correct_average, n_FALSE, n_TRUE)
ttm_correct$is_level_mastered_FALSE[is.na(ttm_correct$is_level_mastered_FALSE)] <- 0
ttm_correct$is_level_mastered_TRUE_correct_average[is.na(ttm_correct$is_level_mastered_TRUE_correct_average)] <- 0
ttm_game <- ttm_correct %>% mutate(ttm_game_id = is_level_mastered_FALSE + is_level_mastered_TRUE_correct_average)
ttm_game
## # A tibble: 497,473 × 9
## # Groups:   user_id, skill_family, game_id, game_level [497,473]
##    user_id                skill_family game_id game_level is_level_mastered_FA…¹
##    <chr>                  <chr>        <chr>        <dbl>                  <dbl>
##  1 000429ab-2a11-4d5e-8a… Early Decod… tongue…          1                   5.86
##  2 000429ab-2a11-4d5e-8a… Early Decod… word_s…          4                  22.3 
##  3 000429ab-2a11-4d5e-8a… Microcompre… storyb…          4                   0   
##  4 000429ab-2a11-4d5e-8a… Phonologica… all_ab…         13                   0   
##  5 000429ab-2a11-4d5e-8a… Phonologica… all_ab…         14                   9.19
##  6 000429ab-2a11-4d5e-8a… Phonologica… cut_it…          4                   0   
##  7 000429ab-2a11-4d5e-8a… Phonologica… cut_it…          5                   0   
##  8 000429ab-2a11-4d5e-8a… Phonologica… cut_it…          6                   0.74
##  9 000429ab-2a11-4d5e-8a… Phonologica… gem_ny…         15                   4.16
## 10 000429ab-2a11-4d5e-8a… Phonologica… gem_ny…         16                   5.87
## # ℹ 497,463 more rows
## # ℹ abbreviated name: ¹​is_level_mastered_FALSE
## # ℹ 4 more variables: is_level_mastered_TRUE_correct_average <dbl>,
## #   n_FALSE <int>, n_TRUE <int>, ttm_game_id <dbl>
write_csv(ttm_game, "ttm_skill_game_level_user.csv") # will be used later when we generate the exploratory data analysis for Figures 8 & 9 - The number of individuals from Cluster 3, 4, and 7 who played each level of the early decoding/vocabulary games.

 ttm_skill_mean <- 
  ttm_game %>% ungroup() %>%
  group_by(user_id, skill_family) %>%
  dplyr::summarise(ttm = mean(ttm_game_id))
ttm_skill_mean
## # A tibble: 58,171 × 3
## # Groups:   user_id [19,226]
##    user_id                              skill_family             ttm
##    <chr>                                <chr>                  <dbl>
##  1 000429ab-2a11-4d5e-8a0e-89f03fab274a Early Decoding         14.7 
##  2 000429ab-2a11-4d5e-8a0e-89f03fab274a Microcomprehension      2.96
##  3 000429ab-2a11-4d5e-8a0e-89f03fab274a Phonological Awareness  4.26
##  4 0006c480-3121-424d-ab57-8fa03d136eb9 Early Decoding          2.40
##  5 0006c480-3121-424d-ab57-8fa03d136eb9 Microcomprehension      2.96
##  6 0006c480-3121-424d-ab57-8fa03d136eb9 Phonological Awareness 16.1 
##  7 0006c480-3121-424d-ab57-8fa03d136eb9 Vocabulary              2.68
##  8 00087268-62bd-450c-8ab5-94f282cbdf8c Early Decoding          2.88
##  9 00087268-62bd-450c-8ab5-94f282cbdf8c Microcomprehension      6.12
## 10 00087268-62bd-450c-8ab5-94f282cbdf8c Phonological Awareness  2.2 
## # ℹ 58,161 more rows
# pivot_wider
ttm_skill <- ttm_skill_mean %>% 
  pivot_wider(., 
              id_cols = user_id, 
              names_from = skill_family, 
              names_prefix = "ttm_within_", 
              values_from = ttm)
summary(ttm_skill)
##    user_id          ttm_within_Early Decoding ttm_within_Microcomprehension
##  Length:19226       Min.   :  0.240           Min.   : 0.600               
##  Class :character   1st Qu.:  3.136           1st Qu.: 3.315               
##  Mode  :character   Median :  4.424           Median : 4.493               
##                     Mean   :  5.693           Mean   : 5.295               
##                     3rd Qu.:  6.651           3rd Qu.: 6.345               
##                     Max.   :111.261           Max.   :61.910               
##                     NA's   :1492              NA's   :3736                 
##  ttm_within_Phonological Awareness ttm_within_Vocabulary
##  Min.   : 0.000                    Min.   : 0.350       
##  1st Qu.: 3.508                    1st Qu.: 2.130       
##  Median : 5.747                    Median : 2.958       
##  Mean   : 7.283                    Mean   : 3.606       
##  3rd Qu.: 9.270                    3rd Qu.: 4.215       
##  Max.   :97.270                    Max.   :74.110       
##  NA's   :7249                      NA's   :6256
# bind the ttm for each user with the cluster label

left_join(user_engagement1,ttm_skill, by = "user_id") %>% 
  group_by(cluster_km_eclust_reorder) %>%
  dplyr::summarise(mean_ttm_PA = mean(`ttm_within_Phonological Awareness`, na.rm = TRUE),
    mean_ttm_ED = mean(`ttm_within_Early Decoding`, na.rm = TRUE),
                   
                   mean_ttm_V = mean(`ttm_within_Vocabulary`, na.rm = TRUE),
                   mean_ttm_MC = mean(`ttm_within_Microcomprehension`, na.rm = TRUE),
                   n = length(user_id)) %>%
  kable(.)
cluster_km_eclust_reorder mean_ttm_PA mean_ttm_ED mean_ttm_V mean_ttm_MC n
1 4.797642 3.936924 3.985000 5.418564 1726
2 3.168400 3.575509 4.486039 5.880932 571
3 5.635046 4.188277 3.205261 5.243927 3468
4 6.161341 4.615345 3.151626 4.981120 4862
5 4.636009 5.656944 4.569967 7.067588 248
6 6.699979 5.505885 4.220203 5.937653 1126
7 7.071261 7.106238 3.184267 4.768363 4001
8 9.196254 7.445155 3.547691 5.236018 2516
9 12.382490 10.347251 4.945985 6.426812 787

3). Attempts to mastery

all_attempts_within_skills <- Data_G2 %>% 
  filter(!is.na(round_completed_at)) %>% 
  filter(elapsed_time_m != 0) %>%
  filter(is_level_mastered == TRUE) %>% 
  mutate(skill = case_when(skill_family == "Early Decoding" ~ "Early_Decoding",
                           skill_family == "Phonological Awareness" ~ "Phonological_Awareness",
                            skill_family == "Vocabulary" ~ "Vocabulary",
                           skill_family == "Microcomprehension" ~ "Microcomprehension")) %>%
  group_by(user_id, skill, game_id) %>%
  filter(skill == "Early_Decoding" | skill == "Phonological_Awareness" | skill == "Vocabulary" | skill == "Microcomprehension") %>%
  distinct() %>%
  dplyr::summarise(all_number_of_levels_mastered_within_games = length(unique(game_level))) %>%
  group_by(user_id, skill) %>%
  dplyr::summarise(all_number_of_levels_mastered = sum(all_number_of_levels_mastered_within_games)) %>%
  pivot_wider(., 
              id_cols = user_id, 
              names_from = skill, 
              names_prefix = "all_number_of_levels_mastered_within_", 
              values_from = all_number_of_levels_mastered)

ttm_game$n_FALSE_0 <- ttm_game$n_FALSE
ttm_game$n_FALSE_0[is.na(ttm_game$n_FALSE)] <- 0
ttm_game$attempts_to_mastery <- ttm_game$n_FALSE_0 + 1
attempts_to_mastered_within_skills <- 
  ttm_game %>% ungroup() %>%
  group_by(user_id, skill_family) %>%
  dplyr::summarise(attempts_to_mastery = mean(attempts_to_mastery)) %>%
  mutate(skill = case_when(skill_family == "Early Decoding" ~ "Early_Decoding",
                           skill_family == "Phonological Awareness" ~ "Phonological_Awareness",
                           skill_family == "Vocabulary" ~ "Vocabulary",
                           skill_family == "Microcomprehension" ~ "Microcomprehension")) %>%
  pivot_wider(., 
              id_cols = user_id,
              names_from = skill,
              names_prefix = "attempts_to_mastered_within_",
              values_from = attempts_to_mastery) 
summary(attempts_to_mastered_within_skills)
##    user_id          attempts_to_mastered_within_Early_Decoding
##  Length:19226       Min.   : 1.000                            
##  Class :character   1st Qu.: 1.222                            
##  Mode  :character   Median : 1.571                            
##                     Mean   : 1.822                            
##                     3rd Qu.: 2.125                            
##                     Max.   :13.625                            
##                     NA's   :1492                              
##  attempts_to_mastered_within_Microcomprehension
##  Min.   : 1.000                                
##  1st Qu.: 1.167                                
##  Median : 1.500                                
##  Mean   : 1.683                                
##  3rd Qu.: 2.000                                
##  Max.   :25.500                                
##  NA's   :3736                                  
##  attempts_to_mastered_within_Phonological_Awareness
##  Min.   : 1.000                                    
##  1st Qu.: 1.500                                    
##  Median : 2.000                                    
##  Mean   : 2.439                                    
##  3rd Qu.: 3.000                                    
##  Max.   :15.000                                    
##  NA's   :7249                                      
##  attempts_to_mastered_within_Vocabulary
##  Min.   : 1.000                        
##  1st Qu.: 1.182                        
##  Median : 1.500                        
##  Mean   : 1.673                        
##  3rd Qu.: 2.000                        
##  Max.   :10.500                        
##  NA's   :6256
# bind atm with cluster label

left_join(user_engagement1,attempts_to_mastered_within_skills, by = "user_id") %>% 
  group_by(cluster_km_eclust_reorder) %>%
  dplyr::summarise(mean_atm_PA = mean(attempts_to_mastered_within_Phonological_Awareness, na.rm = TRUE),
    mean_atm_ED = mean(attempts_to_mastered_within_Early_Decoding, na.rm = TRUE),
                   
                   mean_atm_V = mean(attempts_to_mastered_within_Vocabulary, na.rm = TRUE),
                   mean_atm_MC = mean(attempts_to_mastered_within_Microcomprehension, na.rm = TRUE),
                   n = length(user_id)) %>%
  kable(.)
cluster_km_eclust_reorder mean_atm_PA mean_atm_ED mean_atm_V mean_atm_MC n
1 1.723510 1.399759 1.551082 1.537733 1726
2 1.375458 1.363598 1.608074 1.753840 571
3 1.990870 1.471671 1.529100 1.634112 3468
4 2.283866 1.711988 1.526006 1.629695 4862
5 1.774620 2.005690 2.090011 2.111692 248
6 2.164633 1.890904 1.995410 1.819961 1126
7 2.446027 2.072551 1.670462 1.584906 4001
8 2.845693 2.094842 1.791405 1.756719 2516
9 3.577962 2.795040 2.316668 2.132378 787

4). Speed of mastery

spm <- left_join(user_engagement1, all_number_of_levels_mastered_within_skills, by = "user_id") %>%
  dplyr::group_by(user_id) %>%
  dplyr::summarise(spm_PA = 
                     all_number_of_levels_mastered_within_Phonological_Awareness/all_hours_spent_on_Phonological_Awareness,
                   spm_ED = 
      all_number_of_levels_mastered_within_Early_Decoding/all_hours_spent_on_Early_Decoding,
                   spm_V = all_number_of_levels_mastered_within_Vocabulary/all_hours_spent_on_Vocabulary,
                   spm_MC = all_number_of_levels_mastered_within_Microcomprehension/all_hours_spent_on_Microcomprehension)
# calculate those who didn't master any level within one skill
summary(spm)
##    user_id              spm_PA            spm_ED            spm_V        
##  Length:19305       Min.   :  0.000   Min.   :  0.000   Min.   :  0.000  
##  Class :character   1st Qu.:  2.335   1st Qu.:  4.461   1st Qu.:  7.032  
##  Mode  :character   Median :  5.112   Median :  8.464   Median : 11.472  
##                     Mean   : 10.422   Mean   :  9.755   Mean   : 13.686  
##                     3rd Qu.:  9.569   3rd Qu.: 13.683   3rd Qu.: 17.510  
##                     Max.   :545.455   Max.   :157.895   Max.   :171.429  
##                     NA's   :7193      NA's   :1408      NA's   :6077     
##      spm_MC       
##  Min.   :  0.000  
##  1st Qu.:  4.743  
##  Median :  7.447  
##  Mean   :  8.625  
##  3rd Qu.: 11.194  
##  Max.   :100.000  
##  NA's   :3676
spm$spm_ED[is.nan(spm$spm_ED)] <- 0
spm$spm_PA[is.nan(spm$spm_PA)] <- 0
spm$spm_V[is.nan(spm$spm_V)] <- 0
spm$spm_MC[is.nan(spm$spm_MC)] <- 0

# bind spm with cluster label

left_join(user_engagement1,spm, by = "user_id") %>% 
  group_by(cluster_km_eclust_reorder) %>%
  dplyr::summarise(mean_stm_PA = mean(spm_PA),
    mean_stm_ED = mean(spm_ED),
                   
                   mean_stm_V = mean(spm_V),
                   mean_stm_MC = mean(spm_MC),
                   n = length(user_id)) %>%
  kable(.)
cluster_km_eclust_reorder mean_stm_PA mean_stm_ED mean_stm_V mean_stm_MC n
1 7.516021 13.310220 10.205932 7.626285 1726
2 16.737954 13.857169 8.592301 6.360358 571
3 5.390256 12.299634 14.594792 7.775597 3468
4 6.854270 9.421934 9.426594 5.110182 4862
5 9.825252 7.886379 8.310795 4.621455 248
6 5.250087 8.467124 11.135245 6.881212 1126
7 6.730873 5.879809 4.354098 8.009630 4001
8 5.279588 6.586768 9.332740 8.058813 2516
9 3.961738 4.629472 8.342615 6.327925 787

5.4 Figure 5: Radar charts for phonological awareness and early decoding.

# library(fmsb) # we use "fmsb" package to perform radar chart

# based on the ranking results for each variable for each cluster, we use the radar chart to show the informative engagement information

C1 <- as.data.frame(matrix(c(7,7, 3,2, 3,2, 7,7, 2,2), ncol = 5)) # the numbers filled in are the ranking results for each dimension, counterclockwise
colnames(C1) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C1) <- c("Phonological Awareness", "Early Decoding")
C_1 <- rbind(rep(9, 5), rep(1, 5), C1)
colors_in <- c("#64B8FF", "#00008B")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12)
radarchart(C_1, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C1),
           pcol = colors_in,
           title = "Cluster 1 (n = 1726, 8.94%)", 
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5#size of value labels
           )

C2 <- as.data.frame(matrix(c(4,5, 1,1, 1,1, 4,4, 1,1), ncol = 5))
colnames(C2) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C2) <- c("PA", "ED")
C_2 <- rbind(rep(9, 5), rep(1, 5), C2)
colors_in <- c("#64B8FF", "#00008B")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12)
radarchart(C_2, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C1),
           pcol = colors_in,
            title = "Cluster 2 (n = 571, 2.96%)", 
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

C3 <- as.data.frame(matrix(c(8,6, 4,3, 6,3, 8,6, 4,3), ncol = 5))
colnames(C3) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C3) <- c("PA", "ED")
C_3 <- rbind(rep(9, 5), rep(1, 5), C3)
colors_in <- c("#64B8FF", "#00008B")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12)
radarchart(C_3, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C1),
           pcol = colors_in,
           title = "Cluster 3 (n = 3468, 17.96%)", 
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

C4 <- as.data.frame(matrix(c(9,9, 5,4, 4,4, 9,9, 6,4), ncol = 5))
colnames(C4) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C4) <- c("PA", "ED")
C_4 <- rbind(rep(9, 5), rep(1, 5), C4)
colors_in <- c("#64B8FF", "#00008B")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12)
radarchart(C_4, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C1),
           pcol = colors_in,
           title = "Cluster 4 (n = 4862, 25.19%)", 
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

C5 <- as.data.frame(matrix(c(2,1, 2,6, 2,6, 1,1, 3,6), ncol = 5))
colnames(C5) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C5) <- c("PA", "ED")
C_5 <- rbind(rep(9, 5), rep(1, 5), C5)
colors_in <- c("#64B8FF", "#00008B")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12)
radarchart(C_5, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C1),
           pcol = colors_in,
          title = "Cluster 5 (n = 248, 1.28%)", 
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

C6 <- as.data.frame(matrix(c(6,3, 6,5, 8,5, 5,2, 5,5), ncol = 5))
colnames(C6) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C6) <- c("PA", "ED")
C_6 <- rbind(rep(9, 5), rep(1, 5), C6)
colors_in <- c("#64B8FF", "#00008B")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12)
radarchart(C_6, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C1),
           pcol = colors_in,
          title = "Cluster 6 (n = 1126, 5.83%)",  
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

C7 <- as.data.frame(matrix(c(5,8, 7,7, 5,8, 6,8, 7,7), ncol = 5))
colnames(C7) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C7) <- c("PA", "ED")
C_7 <- rbind(rep(9, 5), rep(1, 5), C7)
colors_in <- c("#64B8FF", "#00008B")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12)
radarchart(C_7, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C1),
           pcol = colors_in,
         title = "Cluster 7 (n = 4001, 20.73%)",  
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

C8 <- as.data.frame(matrix(c(3,4, 8,8, 7,7, 3,5, 8,8), ncol = 5))
colnames(C8) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C8) <- c("PA", "ED")
C_8 <- rbind(rep(9, 5), rep(1, 5), C8)
colors_in <- c("#64B8FF", "#00008B")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12)
radarchart(C_8, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C1),
           pcol = colors_in,
          title = "Cluster 8 (n = 2516, 13.03%)", 
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

C9 <- as.data.frame(matrix(c(1,2, 9,9, 9,9, 2,3, 9,9), ncol = 5))
colnames(C9) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C9) <- c("Phonological Awareness", "Early Decoding")
C_9 <- rbind(rep(9, 5), rep(1, 5), C9)
colors_in <- c("#64B8FF", "#00008B")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12)
radarchart(C_9, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C1),
           pcol = colors_in,
         title = "Cluster 9 (n = 787, 4.08%)", 
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

5.5 Figure 6: Radar charts for vocabulary and comprehension processes

# V,MC
C1 <- as.data.frame(matrix(c(3,3, 5,5, 3,4, 3,3, 3,1), ncol = 5))
colnames(C1) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C1) <- c("V", "MC")
C_1 <- rbind(rep(9, 5), rep(1, 5), C1)
colors_in <- c("#FFA500","#FF0000")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12, face = "bold")
radarchart(C_1, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C_9),
           pcol = colors_in,
         title = "Cluster 1 (n = 1726, 8.94%)", 
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

C2 <- as.data.frame(matrix(c(1,1, 7,6, 6,6, 1,1, 4,5), ncol = 5))
colnames(C2) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C2) <- c("V", "MC")
C_2 <- rbind(rep(9, 5), rep(1, 5), C2)
colors_in <- c("#FFA500","#FF0000")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12, face = "bold")
radarchart(C_2, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C_9),
           pcol = colors_in,
         title = "Cluster 2 (n = 571, 2.96%)", 
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

C3 <- as.data.frame(matrix(c(6,6, 3,4, 3,4, 5,6, 2,4), ncol = 5))
colnames(C3) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C3) <- c("V", "MC")
C_3 <- rbind(rep(9, 5), rep(1, 5), C3)
colors_in <- c("#FFA500","#FF0000")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12, face = "bold")
radarchart(C_3, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C_9),
           pcol = colors_in,
         title = "Cluster 3 (n = 3468, 17.96%)", 
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

C4 <- as.data.frame(matrix(c(8,9, 1,2, 4,8, 8,9, 1,3), ncol = 5))
colnames(C4) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C4) <- c("V", "MC")
C_4 <- rbind(rep(9, 5), rep(1, 5), C4)
colors_in <- c("#FFA500","#FF0000")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12, face = "bold")
radarchart(C_4, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C_9),
           pcol = colors_in,
           title = "Cluster 4 (n = 4862, 25.19%)", 
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

C5 <- as.data.frame(matrix(c(2,2, 8,9, 8,9, 2,2, 8,8), ncol = 5))
colnames(C5) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C5) <- c("V", "MC")
C_5 <- rbind(rep(9, 5), rep(1, 5), C5)
colors_in <- c("#FFA500","#FF0000")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12, face = "bold")
radarchart(C_5, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C_9),
           pcol = colors_in,
           title = "Cluster 5 (n = 248, 1.28%)", 
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

C6 <- as.data.frame(matrix(c(4,4, 6,7, 2,5, 4,4, 7,7), ncol = 5))
colnames(C6) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C6) <- c("V", "MC")
C_6 <- rbind(rep(9, 5), rep(1, 5), C6)
colors_in <- c("#FFA500","#FF0000")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12, face = "bold")
radarchart(C_6, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C_9),
           pcol = colors_in,
           title = "Cluster 6 (n = 1126, 5.83%)", 
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

C7 <- as.data.frame(matrix(c(9,8, 2,1, 9,2, 9,8, 5,2), ncol = 5))
colnames(C7) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C7) <- c("V", "MC")
C_7 <- rbind(rep(9, 5), rep(1, 5), C7)
colors_in <- c("#FFA500","#FF0000")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12, face = "bold")
radarchart(C_7, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C_9),
           pcol = colors_in,
           title = "Cluster 7 (n = 4001, 20.73%)", 
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

C8 <- as.data.frame(matrix(c(7,7, 4,3, 5,1, 7,7, 6,6), ncol = 5))
colnames(C8) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C8) <- c("V", "MC")
C_8 <- rbind(rep(9, 5), rep(1, 5), C8)
colors_in <- c("#FFA500","#FF0000")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12, face = "bold")
radarchart(C_8, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C_9),
           pcol = colors_in,
           title = "Cluster 8 (n = 2516, 13.03%)", 
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

C9 <- as.data.frame(matrix(c(5,5, 9,8, 7,7, 6,5, 9,9), ncol = 5))
colnames(C9) <- c("engagement", "time", "speed", "levels", "attempts")
rownames(C9) <- c( "V", "MC")
C_9 <- rbind(rep(9, 5), rep(1, 5), C9)
colors_in <- c("#FFA500","#FF0000")
par(mar = c(1, 2, 2, 1), family = "Times New Roman", font=12, face = "bold")
radarchart(C_9, 
           caxislabels = c(1,2,3,4,5,6,7,8,9),
           seg = 8,
           axistype = 3,
           axislabcol = "grey", 
           vlabels = colnames(C_9),
           pcol = colors_in,
           title = "Cluster 9 (n = 787, 4.08%)", 
           cex.main = 2,
           plwd = 4,
           plty = 1,
           vlcex = 2,
           palcex = 1.5
)

6. Exploratory Data Analysis - figure S.8 and S.9 (see supplementary materials)

ttm changes refer to levels - graphic way

ttm_game <- read_csv("ttm_skill_game_level_user.csv") # directly use the dataset obtained when we calculating time to mastery 
ttm_label <- left_join(ttm_game, user_engagement1[, c("user_id", "cluster_km_eclust_reorder")], by = "user_id") 
ttm_label$cluster_km_eclust_reorder <- as.character(ttm_label$cluster_km_eclust_reorder)
ttm_label$game_level <- as.factor(ttm_label$game_level)

PA <- ttm_label %>% filter(skill_family == "Phonological Awareness")
ED <- ttm_label %>% filter(skill_family == "Early Decoding") 
V <- ttm_label %>% filter(skill_family == "Vocabulary")
MC <- ttm_label %>% filter(skill_family == "Microcomprehension") # comprehension processes

ED %>%
  mutate(level = as.numeric(game_level)) %>%
  filter(cluster_km_eclust_reorder == 3 | 
           cluster_km_eclust_reorder == 4 |
           cluster_km_eclust_reorder == 7) %>%
  group_by(level, cluster_km_eclust_reorder) %>%
  dplyr::summarise(number_of_users = length(unique(user_id)) ) %>%
  ggplot(.) + geom_bar(aes(x = level, y = number_of_users), stat = "identity") + facet_wrap(~ cluster_km_eclust_reorder) + scale_x_continuous(breaks = seq(0, 200, 20)) + theme_light() + xlab("Game level") + ylab("Number of users")+
  theme(text=element_text(family="Times New Roman", size=12))

V %>%
  mutate(level = as.numeric(game_level)) %>%
  filter(cluster_km_eclust_reorder == 3 | 
           cluster_km_eclust_reorder == 4 |
           cluster_km_eclust_reorder == 7) %>%
  group_by(level, cluster_km_eclust_reorder) %>%
  dplyr::summarise(number_of_users = length(unique(user_id)) ) %>%
  ggplot(.) + geom_bar(aes(x = level, y = number_of_users), stat = "identity") + facet_wrap(~ cluster_km_eclust_reorder) + scale_x_continuous(breaks = seq(0, 50, 10)) + theme_light() + xlab("Game level") + ylab("Number of users")+
  theme(text=element_text(family="Times New Roman", size=12))